# MDW redo of post analysis

options(show.signif.stars=FALSE,show.coef.Pvalues=FALSE,scipen=200)

library(verification)

library(ROCR)



dd <- c("C:/Documents and Settings/mward/Desktop/50percentCV") 

setwd(dd)



dat<-dget("data9000")

Xd<-dat[[2]]

Y0<-dat[[1]] ; diag(Y0)<-NA  ; Y0<-1*(Y0>0) # makes binary

Yd<-Y0

Ytt<-dget(paste("Y.K", 3, ".p0.5", sep=""))

out<-dget(paste("Resultsk", 3, "/OUT.K", 3, ".p0.5", sep=""))

EY<-dget(paste("Resultsk", 3, "/EY.pm.K", 3, ".p0.5", sep=""))



n<-dim(Yd)[1]

Yd.t<-NULL

for(i in 1:n){

for(j in (1:n)[-i] ){yd.t<-c(rownames(Yd)[i], colnames(Yd)[j], Yd[i,j])

                     Yd.t<-rbind(Yd.t, yd.t)}}

Ytt.t<-NULL

for(i in 1:n){

for(j in (1:n)[-i] ){ytt.t<-c(rownames(Ytt)[i], colnames(Ytt)[j],Ytt[i,j])

                     Ytt.t<-rbind(Ytt.t, ytt.t)}}

EY.t<-NULL

for(i in 1:n){

for(j in (1:n)[-i] ){ey.t<-c(rownames(EY)[i], colnames(EY)[j],EY[i,j])

                     EY.t<-rbind(EY.t, ey.t)}}

Pred<-cbind(Yd.t[,c(1,2, 3)], Ytt.t[,3], EY.t[,3]); Pred<-as.data.frame(Pred)





rownames(Pred)<-1:(n*(n-1))

colnames(Pred)<-c("sender", "receiver", "yd", "yd.test", "yd.pred")

Pred$yd<-as.numeric(as.character(Pred$yd))

Pred$yd.test<-as.numeric(as.character(Pred$yd.test))

Pred$yd.pred<-as.numeric(as.character(Pred$yd.pred))

summary(Pred$yd)





Pred.test<-Pred[is.na(Pred$yd.test),]

Pred.test$yd<-as.numeric(as.character(Pred.test$yd))

Pred.test$yd.test<-as.numeric(as.character(Pred.test$yd.test))

Pred.test$yd.pred<-as.numeric(as.character(Pred.test$yd.pred))



Train.test<-Pred[!is.na(Pred$yd.test),]

Train.test$yd<-as.numeric(as.character(Train.test$yd))

Train.test$yd.test<-as.numeric(as.character(Train.test$yd.test))

Train.test$yd.pred<-as.numeric(as.character(Train.test$yd.pred))



# now for generic logit

### simple glm: # Where does this data come from, and why not simply use data9000?

# this code is wrong and is not comparable, as it doesn't seem to separate things into

# test and training sets.... sigh



glmdat<-dget("glmdata")

glmdat<-glmdat[!is.na(glmdat[,1]),] # get rid of the NAs from the diagnol

glmfit<-glm(glmdat[,2]~glmdat[, -c(1,2,3)],family="binomial") # run glm based on the training set

summary(glmfit)

# data.test<-na.omit(glmdat) # this is wrong, it gets training, not test

# THIS CODE IS WRONG.

data.test<-glmdat[is.na(glmdat[,2]),] # this is the test set.

data.train<-glmdat[!is.na(glmdat[,2]),] # this is the training set.

dim(data.test)

pred.test<-(data.test[,-c(1,2)])%*%c(glmfit$coef)

p.test<-exp(pred.test)/(1+exp(pred.test))

pred.on<-cbind(data.test[,1], pred.test, p.test)

Pred.On<-pred.on[order(pred.on[,3], decreasing = T),]

# there are 32 out of top 100 correctly predicted. 



pred.train<-(data.train[,-c(1,2)])%*%c(glmfit$coef)

p.train<-exp(pred.train)/(1+exp(pred.train))

pred.on.train<-cbind(data.train[,1], pred.train, p.train)

Pred.On.train<-pred.on[order(pred.on.train[,3], decreasing = T),]

# there are 28 out of top 100 correctly predicted. 



brier.test<-brier( Pred.test$yd , Pred.test$yd.pred )$bs

brier.train<-brier( Train.test$yd,Train.test$yd.pred )$bs



# now let's do a roc plot



rocr.pred<-prediction(Pred.test$yd.pred,Pred.test$yd)

rocr.perf<-performance(rocr.pred,"tpr","fpr")

rocr2.pred<-prediction(Train.test$yd.pred,Train.test$yd)

rocr2.perf<-performance(rocr2.pred,"tpr","fpr")

rocr3.pred<-prediction(fitted(glmfit),glmfit$y)

rocr3.perf<-performance(rocr3.pred,"tpr","fpr")

rocr4.pred<-prediction(p.test,pred.on[,1])

rocr4.perf<-performance(rocr4.pred,"tpr","fpr")

plot(rocr.perf,las=1,col="blue")

plot(rocr2.perf,add=T,col="lightgreen")

plot(rocr3.perf,add=T,col="red")

plot(rocr4.perf,add=T,col="purple") # SOMETHING WRONG HERE.....



plot(1:100, 1:100, ylim=c(1,100), type="n", xlab="Number of Out of Sample Links Checked", ylab="Discovered Links",

las=1,bty="n")



Pred.test<-Pred.test[order(as.numeric(as.character(Pred.test$yd.pred)), decreasing = T),]

sum(as.numeric(as.character(Pred.test$yd)))

sum(as.numeric(as.character(Pred.test$yd[1:100])))

top100<-Pred.test[1:100,]

Found<-NULL

found<-0

for (i in 1:100){if (top100$yd[i]==1){found<-found+1; Found<-c(Found, found)}

                 else {found<-found+0; Found<-c(Found, found)}

                 }

# plot(1:100, Found, ylim=c(1,100), type="l", xlab="number checked", ylab="number found", lwd=2)

Found

lines(1:100, Found, col=3+1, lwd=2)

 

lines(1:100, 1:100, lty=2)





### simple glm: # Where does this data come from, and why not simply use data9000?

glmdat<-dget("glmdata")

glmfit<-glm(glmdat[,2]~glmdat[, -c(1,2,3)],family="binomial")

summary(glmfit)

data.test<-na.omit(glmdat)

pred.test<-(data.test[,-c(1,2)])%*%c(glmfit$coef)

p.test<-exp(pred.test)/(1+exp(pred.test))

pred.on<-cbind(data.test[,1], pred.test, p.test)

Pred.On<-pred.on[order(pred.on[,3], decreasing = T),]

Found<-NULL

found<-0

for (i in 1:100){if (Pred.On[i, 1]==1){found<-found+1; Found<-c(Found, found)}

                 else {found<-found+0; Found<-c(Found, found)}

                 }

# plot(1:100, Found, ylim=c(1,100), type="l", xlab="number checked", ylab="number found", lwd=2)

Found

lines(1:100, Found, col="red", lwd=1, lty=2)

text(80,50,"LS Model",col="blue")

text(80,14,"Standard Logit",col="red")
